This vignette visualizes data normalization and QC.

1 Data preprocessing

The data preprocessing consists of the following steps:

This is done in an external python script. The data and all intermediate steps are stored in the corresponding hdf5 files in the PROMISE features directory.

cell_lines = unique(substr(list.files(featuresdir, pattern = "D0"), 1, 7))
features = setNames(
  object = vector(mode="list", length=length(cell_lines)), 
  nm = cell_lines)
features_metadata = setNames(
  object = vector(mode="list", length=length(cell_lines)), 
  nm = cell_lines)
for(cell_line in cell_lines) {
  cl_feature_fn = file.path(feature_dir, sprintf("%s_averaged_features_%s.h5", cell_line, feature_type))
  features[[cell_line]] = data.frame(h5read(cl_feature_fn, hdf5_dataset))
  colnames(features[[cell_line]]) = h5read(cl_feature_fn, "feature_names")
  rownames(features[[cell_line]]) = h5read(cl_feature_fn, "well_names")
  
  features_metadata[[cell_line]] = data.frame(
    "REPLICATE" = h5read(cl_feature_fn, "replicates"),
    "CONCENTRATION" = h5read(cl_feature_fn, "concentrations"),
    "DRUG" = h5read(cl_feature_fn, "drugs"), 
    row.names = rownames(features[[cell_line]]))
}
features = do.call(rbind, features)
rownames(features) = sapply(strsplit(rownames(features), ".", fixed = TRUE), "[[", 2)
features_metadata = do.call(rbind, features_metadata)
rownames(features_metadata) = sapply(strsplit(rownames(features_metadata), ".", fixed = TRUE), "[[", 2)

2 Quality Control

2.1 Comparing Positive controls

For this, I am only interested in the plates where the drug concentration is known, i.e. L08

layout_id = substr(rownames(features), 12, 14)
features = features[layout_id == "L08", ]
features_metadata = features_metadata[layout_id == "L08", ]

The drugs Bortezomib, Irinotecan / SN-38, Staurosporine, Volasertib, and Methotrexate are all potential positive controls. To see how suitable they are for this, I look at how they compare to DMSO, the negative control. To compare treatments, I look at the number of organoids as a proxy for organoid survival.

sf_drug = features_metadata[,"DRUG"]
sf_conc = features_metadata[,"CONCENTRATION"]
sf_drug_conc = ifelse(
  test = sf_drug %in% c("DMSO", "Staurosporine_500nM"), 
  yes = sf_drug, no = paste0(sf_drug, " @ ", sf_conc))
sf_num_organoids = features[,"organoids_num.of.objects"]
sf_avg_org_size = features[,"organoids_x.0.s.area_expected"]
sf_total_area = features[,"Total.Biomass"]

simple_features = data.frame(
  "Num.Objects" = sf_num_organoids,
  "Avg.Org.Size" = sf_avg_org_size,
  "Total.Area" = sf_total_area,
  "Drug" = sf_drug,
  "Concentration" = sf_conc,
  "Drug_Conc" = sf_drug_conc,
  "Cell.Line" = substr(rownames(features), 1, 7)
)

2.1.1 Staurosporine_500nM

Staurosporine and DMSO were only applied at one concentration.

ggplotdf = simple_features[simple_features$Drug %in% c("DMSO", "Staurosporine_500nM"),]
ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Num.Objects, fill = Drug_Conc)) + 
  ggtitle(label = "Number of Organoids per Well (Staurosporine_500nM)") + xlab("Cell Line") + 
  ylab("Number of Organoids")

ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Avg.Org.Size, fill = Drug_Conc)) + 
  ggtitle(label = "Average Are of Organoids (Staurosporine_500nM)") + 
  xlab("Cell Line") + ylab("Average Area")

ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Total.Area, fill = Drug_Conc)) + 
  ggtitle(label = "Total Biomass per Well (Staurosporine_500nM)") + 
  xlab("Cell Line") + ylab("Total Biomass Area")

2.1.2 Bortezomib

ggplotdf = simple_features[simple_features$Drug %in% c("DMSO", "Bortezomib"),]
ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Num.Objects, fill = Drug_Conc)) + 
  ggtitle(label = "Number of Organoids per Well (Bortezomib)") + xlab("Cell Line") + 
  ylab("Number of Organoids")

ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Avg.Org.Size, fill = Drug_Conc)) + 
  ggtitle(label = "Average Are of Organoids (Bortezomib)") + 
  xlab("Cell Line") + ylab("Average Area")

ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Total.Area, fill = Drug_Conc)) + 
  ggtitle(label = "Total Biomass per Well (Bortezomib)") + 
  xlab("Cell Line") + ylab("Total Biomass Area")

2.1.3 Irinotecan / SN-38

ggplotdf = simple_features[simple_features$Drug %in% c("DMSO", "Irinotecan / SN-38"),]
ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Num.Objects, fill = Drug_Conc)) + 
  ggtitle(label = "Number of Organoids per Well (Irinotecan / SN-38)") + xlab("Cell Line") + 
  ylab("Number of Organoids")

ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Avg.Org.Size, fill = Drug_Conc)) + 
  ggtitle(label = "Average Are of Organoids (Irinotecan / SN-38)") + 
  xlab("Cell Line") + ylab("Average Area")

ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Total.Area, fill = Drug_Conc)) + 
  ggtitle(label = "Total Biomass per Well (Irinotecan / SN-38)") + 
  xlab("Cell Line") + ylab("Total Biomass Area")

2.1.4 Volasertib

ggplotdf = simple_features[simple_features$Drug %in% c("DMSO", "Volasertib"),]
ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Num.Objects, fill = Drug_Conc)) + 
  ggtitle(label = "Number of Organoids per Well (Volasertib)") + xlab("Cell Line") + 
  ylab("Number of Organoids")

ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Avg.Org.Size, fill = Drug_Conc)) + 
  ggtitle(label = "Average Are of Organoids (Volasertib)") + 
  xlab("Cell Line") + ylab("Average Area")

ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Total.Area, fill = Drug_Conc)) + 
  ggtitle(label = "Total Biomass per Well (Volasertib)") + 
  xlab("Cell Line") + ylab("Total Biomass Area")

2.1.5 Methotrexate

ggplotdf = simple_features[simple_features$Drug %in% c("DMSO", "Methotrexate"),]
ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Num.Objects, fill = Drug_Conc)) + 
  ggtitle(label = "Number of Organoids per Well (Methotrexate)") + xlab("Cell Line") + 
  ylab("Number of Organoids")

ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Avg.Org.Size, fill = Drug_Conc)) + 
  ggtitle(label = "Average Are of Organoids (Methotrexate)") + 
  xlab("Cell Line") + ylab("Average Area")

ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Total.Area, fill = Drug_Conc)) + 
  ggtitle(label = "Total Biomass per Well (Methotrexate)") + 
  xlab("Cell Line") + ylab("Total Biomass Area")

2.2 Combining Positive Controls

With a few exceptions in individual cell lines, the highest concentrations of positive controls (“0.2” and “1”) seem to be well differentiable from the negative controls with regards to the number of organoids in each well. Grouping the positive controls leads to a larger within-group variance but still leaves them clearly differentiable from the negative controls.

non_stauro_pos_ctrls = c("Bortezomib", "Irinotecan / SN-38", "Volasertib", "Methotrexate")
valid_entries = features_metadata$DRUG %in% c("DMSO", "Staurosporine_500nM") | 
  (features_metadata$DRUG %in% non_stauro_pos_ctrls & features_metadata$CONCENTRATION %in% c("0.2", "1"))
features_combined = features[valid_entries,]
features_metadata_combined = features_metadata[valid_entries,]
features_metadata_combined$DRUG = ifelse(
  test = features_metadata_combined$DRUG == "DMSO",
  yes = "NEG_CTRL", no = "POS_CTRL")
simple_features = data.frame(
  "Num.Objects" = features_combined[,"organoids_num.of.objects"],
  "Avg.Org.Size" = features_combined[,"organoids_x.0.s.area_expected"],
  "Total.Area" = features_combined[,"Total.Biomass"],
  "Drug" = features_metadata_combined[,"DRUG"],
  "Cell.Line" = substr(rownames(features_combined), 1, 7)
)
ggplot(data = simple_features) + geom_boxplot(aes(x = Cell.Line, y = Num.Objects, fill = Drug)) +
  ggtitle(label = "Number of Organoids per Well",
          subtitle = "Grouped positive controls at concentrations of 1 and 0.2") + xlab("Cell Line") +
  ylab("Number of Organoids")

ggplot(data = simple_features) + geom_boxplot(aes(x = Cell.Line, y = Avg.Org.Size, fill = Drug)) +
  ggtitle(label = "Average Area of Organoids per Well",
          subtitle = "Grouped positive controls at concentrations of 1 and 0.2") + xlab("Cell Line") +
  ylab("Average Area of of Organoids")

ggplot(data = simple_features) + geom_boxplot(aes(x = Cell.Line, y = Total.Area, fill = Drug)) +
  ggtitle(label = "Total Area of Biomass per Well",
          subtitle = "Grouped positive controls at concentrations of 1 and 0.2") + xlab("Cell Line") +
  ylab("Total Area of Biomass")

2.3 Cell Line Quality

The average number of organoids as well as the total biomass are features that should vary strongly between the two controls. Consequently, I will exclude any cell lines for which the two controls cannot be properly separated. For this, I determine the z-factor for both the biomass and the number of organoids:

\[ Z^\prime = 1 - \frac{3 \cdot (\sigma_p + \sigma_n)}{|\mu_p - \mu_n|}\]

biomass_zprime = setNames(
  object = vector(mode="numeric", length=length(cell_lines)), 
  nm = cell_lines)
organoids_zprime = setNames(
  object = vector(mode="numeric", length=length(cell_lines)), 
  nm = cell_lines)
for(cell_line in cell_lines) {
  cl_simple_features = simple_features[simple_features$Cell.Line == cell_line, ]
  organoids_zprime[[cell_line]] = imageHTS::zprime(
    cl_simple_features[cl_simple_features$Drug == "NEG_CTRL", "Num.Objects"], 
    cl_simple_features[cl_simple_features$Drug == "POS_CTRL", "Num.Objects"], 
    method = "robust")
  biomass_zprime[[cell_line]] = imageHTS::zprime(
    cl_simple_features[cl_simple_features$Drug == "NEG_CTRL", "Total.Area"], 
    cl_simple_features[cl_simple_features$Drug == "POS_CTRL", "Total.Area"], 
    method = "robust")
}
## No methods found in "RSQLite" for requests: dbGetQuery
ggplotdf = data.frame(
  "Cell.Line" = names(organoids_zprime), 
  "Total.Biomass" = biomass_zprime, 
  "Num.Organoids" = organoids_zprime)
ggplotdf = melt(ggplotdf, id.vars = "Cell.Line")
ggplot(data = ggplotdf) + 
  geom_col(aes(x = Cell.Line, y = value, fill = variable), 
           position = position_dodge())

Usually, a value of Z’ \(\approx\) 0.5 indicates a good assay and a value of Z <= 0 indicates a bad assay. However, the two control populations are clearly separable, as shown by the wilcoxon test (a non-parametric test for determining if two samples come from different populations) for the number of organoids and total biomass.

wilcox_organoids_p = setNames(
  object = vector(mode="numeric", length=length(cell_lines)), 
  nm = cell_lines)
wilcox_biomass_p = setNames(
  object = vector(mode="numeric", length=length(cell_lines)), 
  nm = cell_lines)
for(cell_line in cell_lines) {
  cl_simple_features = simple_features[simple_features$Cell.Line == cell_line, ]
  wilcox_organoids_p[[cell_line]] = wilcox.test(
    cl_simple_features[cl_simple_features$Drug == "NEG_CTRL", "Num.Objects"], 
    cl_simple_features[cl_simple_features$Drug == "POS_CTRL", "Num.Objects"], 
    conf.int = TRUE)$p.value
  wilcox_biomass_p[[cell_line]] = wilcox.test(
    cl_simple_features[cl_simple_features$Drug == "NEG_CTRL", "Total.Area"], 
    cl_simple_features[cl_simple_features$Drug == "POS_CTRL", "Total.Area"], 
    conf.int = TRUE)$p.value
}

panderdf = data.frame(
  "Total.Biomass" = wilcox_biomass_p, 
  "Num.Objects" = wilcox_organoids_p)
pandoc.table(
  panderdf, emphasize.strong.cells = which(panderdf > 0.05, arr.ind = TRUE), 
  caption = "Probabilities that the feature values for the positive and negative controls come from the same distribution (p-values of Wilcoxon test)")
## 
## -------------------------------------------
##    &nbsp;      Total.Biomass   Num.Objects 
## ------------- --------------- -------------
##  **D004T01**     1.241e-17      1.429e-17  
## 
##  **D007T01**      3.3e-17       2.079e-17  
## 
##  **D010T01**     4.146e-17      8.739e-07  
## 
##  **D013T01**     1.564e-17      3.778e-16  
## 
##  **D015T01**    **0.1938**      2.185e-08  
## 
##  **D018T01**     1.241e-17      2.105e-15  
## 
##  **D019T01**     1.477e-17      1.313e-17  
## 
##  **D020T01**     2.943e-17      2.854e-17  
## 
##  **D021T01**     3.633e-14      3.78e-16   
## 
##  **D022T01**     1.586e-14      1.236e-17  
## 
##  **D027T01**     5.555e-09      6.846e-14  
## 
##  **D030T01**     1.241e-17      3.11e-17   
## -------------------------------------------
## 
## Table: Probabilities that the feature values for the positive and negative controls come from the same distribution (p-values of Wilcoxon test)

With the exception of D015T01, all cell lines show a significant separation between the positive and negative control samples with regards to the total biomass feature. The z-factor is apparently too strict. In fact, the z factor for nearly all features seems to indicate a bad assay quality:

# Calculate z factors for each cell line / feature combination
pos_ctrl = features_combined[features_metadata_combined$DRUG == "POS_CTRL",]
pos_ctrl_metadata = features_metadata_combined[features_metadata_combined$DRUG == "POS_CTRL",]
neg_ctrl = features_combined[features_metadata_combined$DRUG == "NEG_CTRL",]
neg_ctrl_metadata = features_metadata_combined[features_metadata_combined$DRUG == "NEG_CTRL",]
feature_z_factors = matrix(
  NA_real_, nrow = length(cell_lines), ncol = ncol(features_combined),
  dimnames = list(cell_lines, colnames(features_combined)))
for(feature in colnames(features_combined)) {
  for(cell_line in cell_lines) {
    feature_z_factors[cell_line, feature] = imageHTS::zprime(
      pos_ctrl[substr(rownames(pos_ctrl), 1, 7) == cell_line, feature],
      neg_ctrl[substr(rownames(neg_ctrl), 1, 7) == cell_line, feature],
      method = "robust")
  }
}

feature_z_factors_df = apply(feature_z_factors, 1, sort)
feature_z_factors_df = do.call(what = c, args = feature_z_factors_df)
feature_z_factors_df = data.frame(
  "Cell.Line" = sapply(strsplit(names(feature_z_factors_df), ".", fixed = TRUE), "[[", 1), 
  "ZFactor" = feature_z_factors_df)
feature_z_factors_df = feature_z_factors_df[is.finite(feature_z_factors_df$ZFactor),]
feature_z_factors_df$Features = NA_real_
for(cell_line in cell_lines) {
  feature_z_factors_df[
    feature_z_factors_df$Cell.Line == cell_line, 
    "Features"] = seq_len(table(
      feature_z_factors_df$Cell.Line)[cell_line])
}

ggplot(data = feature_z_factors_df) + geom_line(aes(x = Features, y = ZFactor, color = Cell.Line)) + 
  coord_cartesian(ylim = c(-1, 1)) + ggtitle(label = "Z-Factor for the features of each cell line")

The assay quality is ensured by the statistically significant separation of positive and negative controls with regard to the number of organoids and the total biomass for all cell lines except D015T01. I exclude this cell line from all further analyses.

cell_lines = cell_lines[cell_lines != "D015T01"]
features = features[substr(rownames(features), 1, 7) != "D015T01", ]
features_combined = features_combined[substr(rownames(features_combined), 1, 7) != "D015T01", ]
features_metadata = features_metadata[substr(rownames(features_metadata), 1, 7) != "D015T01", ]
features_metadata_combined = features_metadata_combined[substr(rownames(features_metadata_combined), 1, 7) != "D015T01", ]

2.4 Pruning Features

Features only make sense if they are sufficiently continuous. I want to discard features that are constant across the negative (DMSO) controls of a plate. A look at the ‘minimum ratio’ of unique values for each feature across plates shows that features are either continuous or constant across a plate. There is a very steep transition between the two states so that the actual threshold chosen is not so important. I keep only features with more than 50% unique values across all plates.

neg_ctrl = features_combined[features_metadata_combined$DRUG == "NEG_CTRL", ]
# This rounding ensures that there are no floating point arithmetic errors that are mistaken as unique values
neg_ctrl = round(neg_ctrl, 8)
neg_ctrl_cl = substr(rownames(neg_ctrl), 1, 14)
unique_vals = aggregate(neg_ctrl, list(neg_ctrl_cl), function(x) length(unique(x)) / length(x))
rownames(unique_vals) = unique_vals$Group.1
unique_vals$Group.1 = NULL
unique_vals_min = apply(unique_vals, 2, min)
unique_vals_min_df = data.frame(
  "Ratio.Unique.Vals" = sort(unique_vals_min, decreasing = TRUE), 
  "Features" = seq_along(unique_vals_min))
ggplot(data = unique_vals_min_df) + geom_point(aes(x = Features, y = Ratio.Unique.Vals)) + 
  ggtitle("Minimum Ratio of Unique Values for each Feature", 
          subtitle = "e.g. a 'minimum ratio' of 0.5 means all plates had a ratio of >= 0.5 of unique values for that feature")

features = features[,names(which(unique_vals_min >= 0.5))]
features_combined = features_combined[,names(which(unique_vals_min >= 0.5))]

2.5 Correlation between replicates

2.5.1 Cell line specfific

I look at all features now to see how well they correlate between replicates of the screen for each cell line individually

cl_correlations = setNames(
  object = vector(mode = "list", length = length(cell_lines)), 
  nm = cell_lines
)
for(cell_line in cell_lines) {
  cl_features = features[substr(rownames(features), 1, 7) == cell_line, ]
  cl_features_metadata = features_metadata[substr(rownames(features_metadata), 1, 7) == cell_line, ]
  rep1 = cl_features[cl_features_metadata$REPLICATE == 1, ]
  rep2 = cl_features[cl_features_metadata$REPLICATE == 2, ]
  if(!identical(paste0(substr(rownames(rep1), 1, 7), "_", substr(rownames(rep1), 12, 19)), 
                paste0(substr(rownames(rep2), 1, 7), "_", substr(rownames(rep2), 12, 19)))) {
    warning("Replicates are not in the same order!")
  }
  rep_correlations = setNames(
    object = vector(mode = "numeric", length = ncol(cl_features)), 
    nm = colnames(cl_features))
  for(feature in colnames(cl_features)) {
    rep_correlations[[feature]] = cor(
      rep1[[feature]], rep2[[feature]], 
      use = "pairwise.complete.obs")
  }
  rep_correlations = sort(rep_correlations, decreasing = TRUE)
  cl_correlations[[cell_line]] = data.frame(
    "Features" = seq_along(rep_correlations), 
    "Correlations" = rep_correlations, 
    "Cell.Line" = cell_line
  )
}

cl_correlations = do.call(rbind, cl_correlations)
ggplot(data = cl_correlations) + geom_line(aes(x = Features, y = Correlations, color = Cell.Line)) + 
  ggtitle("Correlations between Replicates for each Individual Cell Line")

2.5.2 Screen-wide

rep1 = features[features_metadata$REPLICATE == 1, ]
rep2 = features[features_metadata$REPLICATE == 2, ]
if(!identical(paste0(substr(rownames(rep1), 1, 7), "_", substr(rownames(rep1), 12, 19)), 
              paste0(substr(rownames(rep2), 1, 7), "_", substr(rownames(rep2), 12, 19)))) {
  warning("Replicates are not in the same order!")
}

rep_correlations = setNames(
  object = vector(mode = "numeric", length = ncol(features)), 
  nm = colnames(features))
for(feature in colnames(features)) {
  rep_correlations[[feature]] = cor(
    rep1[[feature]], rep2[[feature]], 
    use = "pairwise.complete.obs")
}

ggplotdf = data.frame(
  "Features" = seq_along(sort(rep_correlations)),
  "Correlation" = sort(rep_correlations, decreasing = TRUE))
ggplot(data = ggplotdf) + geom_line(aes(x = Features, y = Correlation)) + 
  ggtitle("Correlations between Replicates across all Cell Lines")